The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.
User defined parameters
print(params)
## $run
## [1] TRUE
##
## $test_run
## [1] FALSE
##
## $save_figs
## [1] FALSE
##
## $ecoregion
## [1] "forest"
##
## $response
## [1] "TotalTreeCover"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
run <- params$run
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc.
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
Data compiled in the prepDataForModels.R script
here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting.Rmd")
modDat <- readRDS( here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf.rds"))
## there are some values of the annual wet degree days 5th percentile that have -Inf?? change to lowest value for now?
modDat[is.infinite(modDat$annWetDegDays_5percentile_3yrAnom), "annWetDegDays_5percentile_3yrAnom"] <- -47.8
## same, but for annual water deficit 95th percentile
modDat[is.infinite(modDat$annWaterDeficit_95percentile_3yrAnom), "annWaterDeficit_95percentile_3yrAnom"] <- -600
# # Convert total cover variables into proportions (for later use in beta regression models) ... proportions are already scaled from zero to 1
# modDat <- modDat %>%
# mutate(TotalTreeCover = TotalTreeCover/100,
# CAMCover = CAMCover/100,
# TotalHerbaceousCover = TotalHerbaceousCover/100,
# BareGroundCover = BareGroundCover/100,
# ShrubCover = ShrubCover/100
# )
# For all response variables, make sure there are no 0s add or subtract .0001 from each, since the Gamma model framework can't handle that
modDat[modDat$TotalTreeCover == 0 & !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.0001
modDat[modDat$CAMCover == 0 & !is.na(modDat$CAMCover), "CAMCover"] <- 0.0001
modDat[modDat$TotalHerbaceousCover == 0 & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.0001
modDat[modDat$BareGroundCover == 0 & !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.0001
modDat[modDat$ShrubCover == 0 & !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.0001
modDat[modDat$BroadleavedTreeCover_prop == 0 & !is.na(modDat$BroadleavedTreeCover_prop), "BroadleavedTreeCover_prop"] <- 0.0001
modDat[modDat$NeedleLeavedTreeCover_prop == 0 & !is.na(modDat$NeedleLeavedTreeCover_prop), "NeedleLeavedTreeCover_prop"] <- 0.0001
modDat[modDat$C4Cover_prop == 0 & !is.na(modDat$C4Cover_prop), "C4Cover_prop"] <- 0.0001
modDat[modDat$C3Cover_prop == 0 & !is.na(modDat$C3Cover_prop), "C3Cover_prop"] <- 0.0001
modDat[modDat$ForbCover_prop == 0 & !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.0001
#
# modDat[modDat$TotalTreeCover ==1& !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.999
# modDat[modDat$CAMCover ==1& !is.na(modDat$CAMCover), "CAMCover"] <- 0.999
# modDat[modDat$TotalHerbaceousCover ==1 & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.999
# modDat[modDat$BareGroundCover ==1& !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.999
# modDat[modDat$ShrubCover ==1& !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.999
# modDat[modDat$BroadleavedTreeCover_prop ==1& !is.na(modDat$BroadleavedTreeCover_prop), "BroadleavedTreeCover_prop"] <- 0.999
# modDat[modDat$NeedleLeavedTreeCover_prop ==1& !is.na(modDat$NeedleLeavedTreeCover_prop), "NeedleLeavedTreeCover_prop"] <- 0.999
# modDat[modDat$C4Cover_prop ==1& !is.na(modDat$C4Cover_prop), "C4Cover_prop"] <- 0.999
# modDat[modDat$C3Cover_prop ==1& !is.na(modDat$C3Cover_prop), "C3Cover_prop"] <- 0.999
# modDat[modDat$ForbCover_prop ==1& !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.999
set.seed(1234)
modDat_1 <- modDat %>%
select(-c(prcp_annTotal:annVPD_min)) %>%
# mutate(Lon = st_coordinates(.)[,1],
# Lat = st_coordinates(.)[,2]) %>%
# st_drop_geometry() %>%
# filter(!is.na(newRegion))
rename("tmin" = tmin_meanAnnAvg_CLIM,
"tmax" = tmax_meanAnnAvg_CLIM, #1
"tmean" = tmean_meanAnnAvg_CLIM,
"prcp" = prcp_meanAnnTotal_CLIM,
"t_warm" = T_warmestMonth_meanAnnAvg_CLIM,
"t_cold" = T_coldestMonth_meanAnnAvg_CLIM,
"prcp_wet" = precip_wettestMonth_meanAnnAvg_CLIM,
"prcp_dry" = precip_driestMonth_meanAnnAvg_CLIM,
"prcp_seasonality" = precip_Seasonality_meanAnnAvg_CLIM, #2
"prcpTempCorr" = PrecipTempCorr_meanAnnAvg_CLIM, #3
"abvFreezingMonth" = aboveFreezing_month_meanAnnAvg_CLIM,
"isothermality" = isothermality_meanAnnAvg_CLIM, #4
"annWatDef" = annWaterDeficit_meanAnnAvg_CLIM,
"annWetDegDays" = annWetDegDays_meanAnnAvg_CLIM,
"VPD_mean" = annVPD_mean_meanAnnAvg_CLIM,
"VPD_max" = annVPD_max_meanAnnAvg_CLIM, #5
"VPD_min" = annVPD_min_meanAnnAvg_CLIM, #6
"VPD_max_95" = annVPD_max_95percentile_CLIM,
"annWatDef_95" = annWaterDeficit_95percentile_CLIM,
"annWetDegDays_5" = annWetDegDays_5percentile_CLIM,
"frostFreeDays_5" = durationFrostFreeDays_5percentile_CLIM,
"frostFreeDays" = durationFrostFreeDays_meanAnnAvg_CLIM,
"soilDepth" = soilDepth, #7
"clay" = surfaceClay_perc,
"sand" = avgSandPerc_acrossDepth, #8
"coarse" = avgCoarsePerc_acrossDepth, #9
"carbon" = avgOrganicCarbonPerc_0_3cm, #10
"AWHC" = totalAvailableWaterHoldingCapacity,
## anomaly variables
tmean_anom = tmean_meanAnnAvg_3yrAnom, #15
tmin_anom = tmin_meanAnnAvg_3yrAnom, #16
tmax_anom = tmax_meanAnnAvg_3yrAnom, #17
prcp_anom = prcp_meanAnnTotal_3yrAnom, #18
t_warm_anom = T_warmestMonth_meanAnnAvg_3yrAnom, #19
t_cold_anom = T_coldestMonth_meanAnnAvg_3yrAnom, #20
prcp_wet_anom = precip_wettestMonth_meanAnnAvg_3yrAnom, #21
precp_dry_anom = precip_driestMonth_meanAnnAvg_3yrAnom, #22
prcp_seasonality_anom = precip_Seasonality_meanAnnAvg_3yrAnom, #23
prcpTempCorr_anom = PrecipTempCorr_meanAnnAvg_3yrAnom, #24
aboveFreezingMonth_anom = aboveFreezing_month_meanAnnAvg_3yrAnom, #25
isothermality_anom = isothermality_meanAnnAvg_3yrAnom, #26
annWatDef_anom = annWaterDeficit_meanAnnAvg_3yrAnom, #27
annWetDegDays_anom = annWetDegDays_meanAnnAvg_3yrAnom, #28
VPD_mean_anom = annVPD_mean_meanAnnAvg_3yrAnom, #29
VPD_min_anom = annVPD_min_meanAnnAvg_3yrAnom, #30
VPD_max_anom = annVPD_max_meanAnnAvg_3yrAnom, #31
VPD_max_95_anom = annVPD_max_95percentile_3yrAnom, #32
annWatDef_95_anom = annWaterDeficit_95percentile_3yrAnom, #33
annWetDegDays_5_anom = annWetDegDays_5percentile_3yrAnom , #34
frostFreeDays_5_anom = durationFrostFreeDays_5percentile_3yrAnom, #35
frostFreeDays_anom = durationFrostFreeDays_meanAnnAvg_3yrAnom #36
)
# small dataset for if testing the data
if(test_run) {
modDat_1 <- slice_sample(modDat_1, n = 1e5)
}
modDat_1[,response] <- modDat_1[,response]+1
ecoregion <- params$ecoregion
response <- params$response
print(paste0("In this model run, the ecoregion is ", ecoregion," and the response variable is ",response))
## [1] "In this model run, the ecoregion is forest and the response variable is TotalTreeCover"
if (ecoregion == "shrubGrass") {
# select data for the ecoregion of interest
modDat_1 <- modDat_1 %>%
filter(newRegion == "dryShrubGrass")
} else if (ecoregion == "forest") {
# select data for the ecoregion of interest
modDat_1 <- modDat_1 %>%
filter(newRegion %in% c("eastForest", "westForest"))
}
# remove the rows that have no observations for the response variable of interest
modDat_1 <- modDat_1[!is.na(modDat_1[,response]),]
hist(modDat_1[,response], main = paste0("Histogram of ",response),
xlab = paste0(response))
The following are the candidate predictor variables for this ecoregion:
if (ecoregion == "shrubGrass") {
# select potential predictor variables for the ecoregion of interest
prednames <-
c(
"tmean" , "prcp" ,"prcp_seasonality" ,"prcpTempCorr" ,
"isothermality" , "annWatDef" ,"sand" ,"coarse" ,
"carbon" , "AWHC" ,"tmin_anom" ,"tmax_anom" ,
"t_warm_anom" , "prcp_wet_anom" ,"precp_dry_anom" ,"prcp_seasonality_anom" ,
"prcpTempCorr_anom" , "aboveFreezingMonth_anom" ,"isothermality_anom" ,"annWatDef_anom" ,
"annWetDegDays_anom", "VPD_mean_anom" ,"VPD_min_anom" ,"frostFreeDays_5_anom" )
} else if (ecoregion == "forest") {
# select potential predictor variables for the ecoregion of interest
prednames <-
c(
"tmean" ,"prcp" , "prcp_dry" , "prcpTempCorr" ,
"isothermality" ,"annWatDef" , "clay" , "sand" ,
"coarse" ,"carbon" , "AWHC" , "tmin_anom" ,
"tmax_anom" ,"prcp_anom" , "prcp_wet_anom" , "precp_dry_anom" ,
"prcp_seasonality_anom" ,"prcpTempCorr_anom" , "aboveFreezingMonth_anom" , "isothermality_anom",
"annWatDef_anom" ,"annWetDegDays_anom" , "VPD_mean_anom" , "VPD_max_95_anom" ,
"frostFreeDays_5_anom" )
}
# subset the data to only include these predictors, and remove any remaining NAs
modDat_1 <- modDat_1 %>%
select(prednames, response, newRegion, Year, Long, Lat, NA_L1NAME, NA_L2NAME) %>%
drop_na()
names(prednames) <- prednames
df_pred <- modDat_1[, prednames]
#
# # print the list of predictor variables
# knitr::kable(format = "html", data.frame("Possible_Predictors" = prednames)
# ) %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
create_summary <- function(df) {
df %>%
pivot_longer(cols = everything(),
names_to = 'variable') %>%
group_by(variable) %>%
summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>%
mutate(across(where(is.numeric), round, 4))
}
modDat_1[prednames] %>%
create_summary() %>%
knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| AWHC | 14.5037 | 0.9085 | 13.6556 | 34.6680 |
| VPD_max_95_anom | 0.0589 | -0.5726 | 0.0367 | 0.8326 |
| VPD_mean_anom | -0.0220 | -0.3148 | -0.0219 | 0.2651 |
| aboveFreezingMonth_anom | 0.1137 | -1.8939 | 0.0667 | 3.3667 |
| annWatDef | 34.6795 | 0.0000 | 24.7127 | 303.5579 |
| annWatDef_anom | -0.0566 | -8.9984 | -0.0757 | 1.0000 |
| annWetDegDays_anom | -0.0059 | -1.2398 | -0.0202 | 0.8050 |
| carbon | 7.9675 | 0.2171 | 4.6003 | 51.0604 |
| clay | 16.1179 | 0.0286 | 15.9399 | 83.0975 |
| coarse | 15.1012 | 0.0000 | 13.2884 | 64.1122 |
| frostFreeDays_5_anom | -25.7789 | -273.1000 | -30.0000 | 53.1000 |
| isothermality | 36.2563 | 21.4503 | 36.4876 | 59.8804 |
| isothermality_anom | 0.7314 | -8.4018 | 0.7281 | 11.7898 |
| prcp | 1112.2163 | 218.1991 | 1133.3410 | 4095.4507 |
| prcpTempCorr | -0.1548 | -0.8596 | -0.1202 | 0.7258 |
| prcpTempCorr_anom | -0.0139 | -0.5978 | -0.0017 | 0.6098 |
| prcp_anom | 0.0056 | -0.7548 | 0.0020 | 0.6720 |
| prcp_dry | 17.0445 | 0.0003 | 12.8220 | 76.7440 |
| prcp_seasonality_anom | -0.0048 | -0.6092 | 0.0030 | 0.4788 |
| prcp_wet_anom | 0.0047 | -1.4179 | 0.0141 | 0.6981 |
| precp_dry_anom | 0.0685 | -9.0000 | 0.0777 | 1.0000 |
| sand | 45.7041 | 0.0098 | 45.0641 | 99.9418 |
| tmax_anom | -0.2860 | -5.6017 | -0.2922 | 4.0197 |
| tmean | 9.4945 | -2.4480 | 8.3730 | 24.9713 |
| tmin_anom | -0.6298 | -5.7692 | -0.5744 | 2.6920 |
# response_summary <- modDat_1 %>%
# dplyr::select(#where(is.numeric), -all_of(pred_vars),
# matches(response)) %>%
# create_summary()
#
#
# kable(response_summary,
# caption = 'summaries of response variables, calculated using paint') %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
set.seed(12011993)
# function for colors
my_fn <- function(data, mapping, method="p", use="pairwise", ...){
# grab data
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
# calculate correlation
corr <- cor(x, y, method=method, use=use)
# calculate colour based on correlation value
# Here I have set a correlation of minus one to blue,
# zero to white, and one to red
# Change this to suit: possibly extend to add as an argument of `my_fn`
colFn <- colorRampPalette(c("red", "white", "blue"), interpolate ='spline')
fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))]
ggally_cor(data = data, mapping = mapping, size = 2.5, stars = FALSE,
digits = 2, colour = I("black"),...) +
theme_void() +
theme(panel.background = element_rect(fill=fill))
}
if (run == TRUE) {
(corrPlot <- modDat_1 %>%
select(prednames) %>%
slice_sample(n = 5e4) %>%
#select(-matches("_")) %>%
ggpairs( upper = list(continuous = my_fn, size = .1), lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.1)), progress = FALSE))
base::saveRDS(corrPlot, paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
} else {
# corrPlot <- readRDS(paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
# (corrPlot)
print(c("See previous correlation figures"))
}
set.seed(12011993)
# vector of name of response variables
vars_response <- response
# longformat dataframes for making boxplots
df_sample_plots <- modDat_1 %>%
slice_sample(n = 5e4) %>%
rename(response = all_of(response)) %>%
mutate(response = case_when(
response <= .25 ~ ".25",
response > .25 & response <=.5 ~ ".5",
response > .5 & response <=.75 ~ ".75",
response >= .75 ~ "1",
)) %>%
select(c(response, prednames)) %>%
tidyr::pivot_longer(cols = unname(prednames),
names_to = "predictor",
values_to = "value"
)
ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
geom_boxplot() +
facet_wrap(~predictor , scales = 'free_y') +
ylab("Predictor Variable Values") +
xlab(response)
modDat_1_s <- modDat_1 %>%
mutate(across(all_of(prednames), base::scale, .names = "{.col}_s"))
names(modDat_1_s) <- c(names(modDat_1),
paste0(prednames, "_s")
)
scaleFigDat_1 <- modDat_1_s %>%
select(c(Long, Lat, Year, prednames)) %>%
pivot_longer(cols = all_of(names(prednames)),
names_to = "predNames",
values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>%
select(c(Long, Lat, Year,paste0(prednames, "_s"))) %>%
pivot_longer(cols = all_of(paste0(prednames, "_s")),
names_to = "predNames",
values_to = "predValues_scaled",
names_sep = ) %>%
mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))
scaleFigDat_3 <- scaleFigDat_1 %>%
left_join(scaleFigDat_2)
ggplot(scaleFigDat_3) +
facet_wrap(~predNames, scales = "free") +
geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") +
geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
xlab ("predictor variable values") +
ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")
## visualize the variation between groups across environmental space
## make data into spatial format
modDat_1_sf <- modDat_1 %>%
st_as_sf(coords = c("Long", "Lat"), crs = st_crs("PROJCRS[\"unnamed\",\n BASEGEOGCRS[\"unknown\",\n DATUM[\"unknown\",\n ELLIPSOID[\"Spheroid\",6378137,298.257223563,\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433,\n ID[\"EPSG\",9122]]]],\n CONVERSION[\"Lambert Conic Conformal (2SP)\",\n METHOD[\"Lambert Conic Conformal (2SP)\",\n ID[\"EPSG\",9802]],\n PARAMETER[\"Latitude of false origin\",42.5,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8821]],\n PARAMETER[\"Longitude of false origin\",-100,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8822]],\n PARAMETER[\"Latitude of 1st standard parallel\",25,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8823]],\n PARAMETER[\"Latitude of 2nd standard parallel\",60,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8824]],\n PARAMETER[\"Easting at false origin\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8826]],\n PARAMETER[\"Northing at false origin\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8827]]],\n CS[Cartesian,2],\n AXIS[\"easting\",east,\n ORDER[1],\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]],\n AXIS[\"northing\",north,\n ORDER[2],\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]]]"))
## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames, "_s")])
library(factoextra)
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))
if(ecoregion == "shrubGrass") {
print("We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)" )
modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
modDat_1[modDat_1$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
modDat_1_s[modDat_1_s$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
modDat_1[modDat_1$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
}
# make a table of n for each region
modDat_1 %>%
group_by(NA_L2NAME) %>%
dplyr::summarize("Number_Of_Observations" = length(NA_L2NAME)) %>%
rename("Level_2_Ecoregion" = NA_L2NAME)%>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Level_2_Ecoregion | Number_Of_Observations |
|---|---|
| ATLANTIC HIGHLANDS | 5686 |
| CENTRAL USA PLAINS | 1253 |
| EVERGLADES | 211 |
| MARINE WEST COAST FOREST | 7457 |
| MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS | 9726 |
| MIXED WOOD PLAINS | 8151 |
| MIXED WOOD SHIELD | 7960 |
| OZARK/OUACHITA-APPALACHIAN FORESTS | 15497 |
| SOUTHEASTERN USA PLAINS | 23498 |
| UPPER GILA MOUNTAINS | 9288 |
| WESTERN CORDILLERA | 68909 |
# download map info for visualization
us_states <- suppressMessages(tigris::states())
cropped_states <- suppressMessages(us_states %>%
dplyr::filter(NAME!="Hawaii") %>%
dplyr::filter(NAME!="Alaska") %>%
dplyr::filter(NAME!="Puerto Rico") %>%
dplyr::filter(NAME!="American Samoa") %>%
dplyr::filter(NAME!="Guam") %>%
dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
dplyr::filter(NAME!="United States Virgin Islands") %>%
sf::st_sf() %>%
sf::st_transform(sf::st_crs(modDat_1_sf))) #%>%
#sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))
map1 <- ggplot() +
geom_sf(data=cropped_states,fill='white') +
geom_sf(data=modDat_1_sf,aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
geom_point(data=modDat_1,alpha=0.5,
aes(x = Long, y = Lat, color=as.factor(NA_L2NAME))) +
#scale_fill_okabeito() +
#scale_color_okabeito() +
# theme_default() +
theme(legend.position = 'none') +
labs(title = "Level 2 Ecoregions as spatial blocks")
hull <- modDat_1_sf %>%
ungroup() %>%
group_by(NA_L2NAME) %>%
slice(chull(tmean, prcp))
plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
theme_minimal() + xlab("Annual Average T_mean - long-term average") +
ylab("Annual Average Precip - long-term average") #+
#scale_color_okabeito() +
#scale_fill_okabeito()
plot2<-ggplot(data=modDat_1_sf %>%
pivot_longer(cols=tmean:prcp),
aes(x=value,group=name)) +
# geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
theme_minimal() +
facet_wrap(~name,scales='free')# +
#scale_color_okabeito() +
#scale_fill_okabeito()
library(patchwork)
(combo <- (map1+plot1)/plot2)
## Currently, removing data from the “Texas Coastal Plain”, since it’s
quite different from other regions and is really messing with model
fit
modDat_1 <- modDat_1 %>%
filter(NA_L2NAME != "TEXAS-LOUISIANA COASTAL PLAIN")
modDat_1_s <- modDat_1_s %>%
filter(NA_L2NAME != "TEXAS-LOUISIANA COASTAL PLAIN")
##
## Call: cv.glmnet(x = X[, 2:ncol(X)], y = y, type.measure = "mse", foldid = my_folds, keep = TRUE, parallel = TRUE, family = stats::Gamma(link = "log"), alpha = 1, nlambda = 100, standardize = FALSE)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.00820 45 1028 100.91 101
## 1se 0.06965 22 1120 96.18 15
Then, fit regular glm models (Gamma glm with a log link), first using the coefficients from the ‘best’ lambda identified in the LASSO model, as then using the coefficients from the ‘1SE’ lambda identified from the LASSO (this is the value of lambda such that the cross validation error is within 1 standard error of the minimum).
## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
mat_glmnet_best <- as.matrix(bestLambda_coef)
mat2_glmnet_best <- mat_glmnet_best[mat_glmnet_best[,1] != 0,]
names(mat2_glmnet_best) <- rownames(mat_glmnet_best)[mat_glmnet_best[,1] != 0]
if(length(mat2_glmnet_best) == 1) {
f_glm_bestLambda <- as.formula(paste0(response, "~ 1"))
} else {
f_glm_bestLambda <- as.formula(paste0(response, " ~ ", paste0(names( mat2_glmnet_best)[2:length(names( mat2_glmnet_best))], collapse = " + ")))
}
fit_glm_bestLambda <- glm(data = modDat_1_s
, formula =
f_glm_bestLambda,
,family = stats::Gamma(link = "log")
)
## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
mat_glmnet_1se <- as.matrix(seLambda_coef)
mat2_glmnet_1se <- mat_glmnet_1se[mat_glmnet_1se[,1] != 0,]
names(mat2_glmnet_1se) <- rownames(mat_glmnet_1se)[mat_glmnet_1se[,1] != 0]
if(length(mat2_glmnet_1se) == 1) {
f_glm_1se <- as.formula(paste0(response, "~ 1"))
} else {
f_glm_1se <- as.formula(paste0(response, " ~ ", paste0(names( mat2_glmnet_1se)[2:length(names( mat2_glmnet_1se))], collapse = " + ")))
}
fit_glm_se <- glm(data = modDat_1_s, formula =
f_glm_1se,
,family = stats::Gamma(link = "log")
)
Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)
## predict on the test data
# lasso model predictions with the optimal lambda
optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response")
optimal_pred_1se <- predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response")
null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]),
formula = y ~ 1, family = stats::Gamma(link = "log"))
null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
)
# save data
fullModOut <- list(
"modelObject" = fit,
"nullModelObject" = null_fit,
"modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
obs=y,
pred_opt=optimal_pred,
pred_opt_se = optimal_pred_1se,
pred_null=null_pred#,
#pred_nopenalty=nopen_pred
))
# calculate correlations between null and optimal model
my_cors <- c(cor(optimal_pred, y),
cor(optimal_pred_1se, y),
cor(null_pred, y)
)
# calculate mse between null and optimal model
my_mse <- c(mean((fullModOut$modelPredictions$pred_opt - y)^2) ,
mean((fullModOut$modelPredictions$pred_opt_se - y)^2) ,
mean((fullModOut$modelPredictions$pred_null - y)^2)#,
#mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
)
ggplot() +
geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) +
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) +
labs(title = "A rough comparison of observed and model-predicted values",
subtitle = "black = observed values \n red = predictions from 'best lambda' model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
xlab(colnames(X)[2])
#ylim(c(0,200))
The internal cross-validation process to fit the global LASSO model
identified an optimal lambda value (regularization parameter) of
r{print(best_lambda)}. The lambda value such that the cross
validation error is within 1 standard error of the minimum (“1se
lambda”) was `r{print(fit$lambda.1se)}`` . The following coefficients
were kept in each model:
# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>%
data.frame()
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model
coef_glm_1se <- coef(fit_glm_se) %>%
data.frame()
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# add together
coefs <- full_join(coef_glm_bestLambda, coef_glm_1se) %>%
select(coefficientName, coefficientValue_bestLambda, coefficientValue_1seLambda)
globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]
kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model", "Value from 1se lambda model")
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Coefficient Name | Value from best lambda model | Value from 1se lambda model |
|---|---|---|
| (Intercept) | 3.8592787 | 3.8280945 |
| tmean_s | 0.2855779 | 0.1340444 |
| prcp_s | 0.1502842 | 0.1570040 |
| prcp_dry_s | 0.1220588 | 0.2474862 |
| annWatDef_s | -0.2243614 | NA |
| clay_s | -0.1381028 | NA |
| sand_s | -0.0508933 | NA |
| carbon_s | 0.0531209 | NA |
| tmax_anom_s | -0.0407801 | NA |
| prcp_anom_s | -0.0345585 | -0.0807579 |
| prcp_seasonality_anom_s | 0.0470745 | 0.0614220 |
| prcpTempCorr_anom_s | -0.0186045 | NA |
| isothermality_anom_s | 0.0952201 | NA |
| VPD_mean_anom_s | -0.0397047 | NA |
| VPD_max_95_anom_s | 0.0715954 | NA |
| frostFreeDays_5_anom_s | 0.0185514 | NA |
| I(tmean_s^2) | -0.1050637 | NA |
| I(prcp_s^2) | -0.0478293 | -0.0416904 |
| I(isothermality_s^2) | -0.0445844 | NA |
| I(tmax_anom_s^2) | 0.0039963 | NA |
| I(precp_dry_anom_s^2) | 0.0007362 | 0.0025346 |
| I(prcpTempCorr_anom_s^2) | 0.0071635 | NA |
| I(aboveFreezingMonth_anom_s^2) | 0.0096640 | NA |
| I(annWatDef_anom_s^2) | 0.0042549 | 0.0037374 |
| I(annWetDegDays_anom_s^2) | -0.0023091 | -0.0149790 |
| I(VPD_mean_anom_s^2) | -0.0096312 | NA |
| I(frostFreeDays_5_anom_s^2) | -0.0014454 | -0.0177705 |
| I(clay_s^2) | 0.0121212 | -0.0300890 |
| I(carbon_s^2) | -0.0292388 | NA |
| I(AWHC_s^2) | -0.0171356 | NA |
| isothermality_anom_s:aboveFreezingMonth_anom_s | 0.0245367 | NA |
| isothermality_anom_s:annWatDef_anom_s | -0.0103998 | NA |
| annWatDef_anom_s:isothermality_s | -0.0358267 | NA |
| prcp_s:annWatDef_anom_s | 0.0326050 | NA |
| prcpTempCorr_anom_s:annWatDef_anom_s | -0.0106727 | NA |
| tmean_s:annWatDef_anom_s | 0.0481380 | NA |
| VPD_max_95_anom_s:annWatDef_anom_s | -0.0118607 | NA |
| annWatDef_s:annWetDegDays_anom_s | 0.0027974 | NA |
| annWatDef_s:frostFreeDays_5_anom_s | -0.0041169 | NA |
| prcp_s:annWatDef_s | 0.1083540 | 0.1920231 |
| annWatDef_s:prcp_seasonality_anom_s | 0.0279452 | NA |
| annWatDef_s:prcpTempCorr_s | -0.0966881 | NA |
| annWatDef_s:precp_dry_anom_s | 0.0003373 | NA |
| annWatDef_s:tmax_anom_s | -0.0270437 | NA |
| tmean_s:annWatDef_s | 0.0920123 | NA |
| annWatDef_s:VPD_mean_anom_s | 0.0047418 | NA |
| prcp_anom_s:annWetDegDays_anom_s | 0.0071696 | NA |
| prcp_s:annWetDegDays_anom_s | 0.0369569 | NA |
| annWetDegDays_anom_s:prcp_wet_anom_s | -0.0228664 | NA |
| annWetDegDays_anom_s:prcpTempCorr_s | 0.0106889 | NA |
| annWetDegDays_anom_s:tmin_anom_s | 0.0092174 | NA |
| VPD_max_95_anom_s:annWetDegDays_anom_s | 0.0107765 | NA |
| isothermality_anom_s:frostFreeDays_5_anom_s | 0.0165818 | NA |
| frostFreeDays_5_anom_s:isothermality_s | -0.0161448 | NA |
| prcp_seasonality_anom_s:frostFreeDays_5_anom_s | 0.0123090 | NA |
| frostFreeDays_5_anom_s:prcpTempCorr_s | -0.0439698 | NA |
| tmean_s:frostFreeDays_5_anom_s | -0.0019913 | NA |
| frostFreeDays_5_anom_s:tmin_anom_s | -0.0213892 | NA |
| VPD_max_95_anom_s:frostFreeDays_5_anom_s | -0.0138506 | NA |
| isothermality_anom_s:isothermality_s | -0.0470516 | NA |
| prcp_s:isothermality_anom_s | -0.0016403 | 0.0648008 |
| prcp_seasonality_anom_s:isothermality_anom_s | -0.0159136 | NA |
| isothermality_anom_s:prcpTempCorr_s | -0.1031717 | NA |
| isothermality_anom_s:precp_dry_anom_s | 0.0227359 | NA |
| tmean_s:isothermality_anom_s | 0.0356437 | NA |
| prcp_anom_s:isothermality_s | 0.0362947 | NA |
| prcp_s:isothermality_s | -0.0012237 | NA |
| prcp_seasonality_anom_s:isothermality_s | -0.0207171 | NA |
| tmean_s:isothermality_s | 0.1033340 | NA |
| isothermality_s:tmin_anom_s | -0.0312114 | NA |
| prcp_anom_s:prcpTempCorr_s | 0.0387347 | NA |
| tmax_anom_s:prcp_anom_s | -0.0199611 | NA |
| prcp_anom_s:VPD_max_95_anom_s | 0.0161879 | NA |
| prcp_s:prcp_dry_s | -0.0006629 | NA |
| prcp_dry_s:prcp_seasonality_anom_s | -0.0468498 | NA |
| prcp_dry_s:prcpTempCorr_anom_s | 0.0236576 | NA |
| prcp_dry_s:VPD_mean_anom_s | 0.0082921 | NA |
| prcp_s:prcp_seasonality_anom_s | 0.0319575 | NA |
| prcp_s:prcp_wet_anom_s | 0.0334289 | NA |
| prcp_s:tmin_anom_s | 0.0151263 | NA |
| prcp_s:VPD_max_95_anom_s | -0.0651954 | NA |
| prcp_seasonality_anom_s:prcp_wet_anom_s | 0.0129690 | NA |
| prcp_seasonality_anom_s:prcpTempCorr_s | -0.0111604 | NA |
| tmax_anom_s:prcp_seasonality_anom_s | 0.0359490 | NA |
| prcp_seasonality_anom_s:tmin_anom_s | 0.0023425 | NA |
| prcp_seasonality_anom_s:VPD_max_95_anom_s | -0.0231415 | NA |
| precp_dry_anom_s:prcp_wet_anom_s | 0.0206864 | NA |
| prcpTempCorr_anom_s:prcpTempCorr_s | 0.0341053 | NA |
| prcpTempCorr_anom_s:precp_dry_anom_s | 0.0090254 | NA |
| tmean_s:prcpTempCorr_anom_s | 0.0135326 | NA |
| tmean_s:prcpTempCorr_s | -0.0601102 | NA |
| prcpTempCorr_s:tmin_anom_s | -0.0388170 | NA |
| VPD_mean_anom_s:prcpTempCorr_s | 0.0597114 | NA |
| tmax_anom_s:precp_dry_anom_s | 0.0037353 | NA |
| precp_dry_anom_s:tmin_anom_s | 0.0234534 | NA |
| VPD_max_95_anom_s:tmin_anom_s | -0.0230298 | NA |
| carbon_s:AWHC_s | -0.0252506 | NA |
| clay_s:AWHC_s | -0.0314509 | NA |
| clay_s:carbon_s | -0.1152228 | -0.1395333 |
| carbon_s:coarse_s | 0.0400878 | NA |
| sand_s:carbon_s | 0.0188409 | NA |
| clay_s:coarse_s | 0.0394824 | NA |
| prcpTempCorr_s:isothermality_s | NA | -0.0793560 |
# create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
df_out <- df
response_name <- paste0(response, "_pred")
df_out <- df_out %>% cbind(predict(mod, newx= df_out, #s="lambda.min",
type = "response"))
colnames(df_out)[ncol(df_out)] <- response_name
return(df_out)
}
pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])
# add back in true y values
pred_glm1 <- pred_glm1 %>%
cbind( data.frame("y" = y))
# rename the true response column to not be 'y_test'
colnames(pred_glm1)[which(colnames(pred_glm1) == "y")] <- paste(response)
# add back in lat/long data
pred_glm1 <- pred_glm1 %>%
cbind(modDat_1_s[,c("Long", "Lat", "Year")])
pred_glm1$resid <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > 70 | pred_glm1$resid < -70,"extremeResid"] <- 1
# plot(x = pred_glm1[,response],
# y = pred_glm1[,paste0(response, "_pred")],
# xlab = "observed values", ylab = "predicted values")
# points(x = pred_glm1[!is.na(pred_glm1$extremeResid), response],
# y = pred_glm1[!is.na(pred_glm1$extremeResid), paste0(response, "_pred")],
# col = "red")
Observations across the temporal range of the dataset
pred_glm1 <- pred_glm1 %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize
# get reference raster
test_rast <- rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
terra::aggregate(fact = 8, fun = "mean")
## |---------|---------|---------|---------|=========================================
# rasterize data
plotObs <- pred_glm1 %>%
drop_na(paste(response)) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = response,
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotObs, na.rm = TRUE)
plotObs_2 <- plotObs %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# make figures
ggplot() +
geom_spatraster(data = plotObs_2) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion"),
subtitle = "bestLambda model") +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "darkgreen" ,
midpoint = 0, na.value = "lightgrey")
Predictions across the temporal range of the dataset
# rasterize data
plotPred <- pred_glm1 %>%
drop_na(paste0(response,"_pred")) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = paste0(response,"_pred"),
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the point location of those predictions that are > 100
highPred_points <- pred_glm1 %>%
filter(.[[paste0(response,"_pred")]] > 100 |
.[[paste0(response, "_pred")]] < 0) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotPred, na.rm = TRUE)
plotPred_2 <- plotPred %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# make figures
ggplot() +
geom_spatraster(data = plotPred_2) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) + geom_sf(data = highPred_points, col = "red") +
labs(title = paste0("Predictions from the fitted model of ", response, " in the ",ecoregion, " ecoregion"),
subtitle = "bestLambda model") +
scale_fill_gradient2(low = "wheat",
mid = "darkgreen",
high = "red" ,
midpoint = 100, na.value = "lightgrey",
limits = c(0,100))
Residuals across the entire temporal extent of the dataset
# rasterize data
plotResid_rast <- pred_glm1 %>%
drop_na(resid) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)
plotResid_rast_2 <- plotResid_rast %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1 %>%
filter(resid > 100) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
badResids_low <- pred_glm1 %>%
filter(resid < -100) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
# make figures
map <- ggplot() +
geom_spatraster(data =
plotResid_rast_2) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
geom_sf(data = badResids_high, col = "blue") +
geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from Grass/shrub ecoregion-wide model of ", response),
subtitle = "bestLambda model \n red points indication locations that have residuals below -100 \n blue points indicate locatiosn that have residuals above 100") +
scale_fill_gradient2(low = "red",
mid = "white" ,
high = "blue" ,
midpoint = 0, na.value = "lightgrey",
limits = c(-100,100)
)
hist <- ggplot(pred_glm1) +
geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2))))
library(ggpubr)
ggarrange(map, hist, heights = c(3,1), ncol = 1)
Binning predictor variables into “Deciles” (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word Decentiles is just a legacy thing (they started out being actual Percentiles)
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE))
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
prednames_fig <- unique(prednames_fig[prednames_fig>0])
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig)
}
Here is a quick version of LOESS curves fit to raw data (to double-check the quantile plot calculations)
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1 %>%
select(all_of(c(prednames_fig, response_vars))) %>%
pivot_longer(cols = prednames_fig) %>%
ggplot() +
facet_wrap(~name, scales = "free") +
geom_point(aes(x = value, y = .data[[paste(response)]]), col = "darkblue", alpha = .1) + # observed values
geom_point(aes(x = value, y = .data[[response_vars[2]]]), col = "lightblue", alpha = .1) + # model-predicted values
geom_smooth(aes(x = value, y = .data[[paste(response)]]), col = "black", se = FALSE) +
geom_smooth(aes(x = value, y = .data[[response_vars[2]]]), col = "brown", se = FALSE)
}
Below are the actual quantile plots (note that the predictor variables are scaled)
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles, response = response) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, pred_glm1_deciles)
if(save_figs) {
png(paste0("figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
units = "in", res = 600, width = 5.5, height = 3.5 )
print(g4)
dev.off()
}
g4
}
20th and 80th percentiles for each climate variable
df <- pred_glm1[, prednames_fig] #%>%
#mutate(MAT = MAT - 273.15) # k to c
quantiles <- map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)
Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each predictor variable.
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig,
filter_var = TRUE,
filter_vars = prednames_fig)
decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = prednames_fig)
#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)
}
Filtered quantile figure with middle 2 deciles also shown
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig,
filter_vars = prednames_fig,
filter_var = TRUE,
add_mid = TRUE)
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = prednames_fig)
g
if(save_figs) {x
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
units = "in", res = 600, width = 5.5, height = 6 )
g
dev.off()
}
}
These models were fit to six ecoregions, and then predict on the indicated heldout ecoregion
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# make a list to hold output data
outList <- vector(mode = "list", length = length(sort(unique(modDat_1$NA_L2NAME))))
# obs_pred <- data.frame(ecoregion = character(),obs = numeric(),
# pred_opt = numeric(), pred_null = numeric()#,
# #pred_nopenalty = numeric()
# )
## get the model specification from the global model
mat <- as.matrix(coef(fit_glm_bestLambda, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]
f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))
X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
# get response variable
y_cv <- as.matrix(modDat_1_s[,response])
# now, loop through so with each iteration, a different ecoregion is held out
for(i_eco in sort(unique(modDat_1_s$NA_L2NAME))){
# split into training and test sets
test_eco <- i_eco
print(test_eco)
# identify the rowID of observations to be in the training and test datasets
train <- which(modDat_1_s$NA_L2NAME!=test_eco) # data for all ecoregions that aren't 'i_eco'
test <- which(modDat_1_s$NA_L2NAME==test_eco) # data for the ecoregion that is 'i_eco'
trainDat_all <- modDat_1_s %>%
slice(train) %>%
select(-newRegion)
testDat_all <- modDat_1_s %>%
slice(test) %>%
select(-newRegion)
# get the model matrices for input and response variables for cross validation model specification
X_train <- as.matrix(X_cv[train,])
X_test <- as.matrix(X_cv[test,])
y_train <- modDat_1_s[train,response]
y_test <- modDat_1_s[test,response]
# get the model matrices for input and response variables for original model specification
X_train_glob <- as.matrix(X[train,])
X_test_glob <- as.matrix(X[test,])
y_train_glob <- modDat_1_s[train,response]
y_test_glob <- modDat_1_s[test,response]
train_eco <- modDat_1_s$NA_L2NAME[train]
# Fit model using glm-----------------------------------------------
#
# # specify leave-one-year-out cross-validation
# my_folds <- as.numeric(as.factor(train_eco))
#
# fit_i <- glmnet(
# x = X_train[,2:ncol(X_train)],
# y = y_train,
# #family = gaussian,
# family = stats::Gamma(link = "log"),
# alpha = 1, # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
# #lambda = lambdas,
# #type.measure="mse",
# #penalty.factor = pen_facts,
# foldid = my_folds,
# standardize = TRUE
# )
## just try a regular glm w/ the components from the global model
fit_i <- glm(data = trainDat_all, formula = f_cv,
# data = trainDat_all,
# formula = TotalTreeCover ~ prcp_s + prcp_seasonality_s + prcpTempCorr_s +
# carbon_s + AWHC_s + t_warm_anom_s + prcp_wet_anom_s + annWetDegDays_anom_s +
# VPD_min_anom_s + I(prcp_seasonality_s^2) + I(prcpTempCorr_s^2) +
# I(isothermality_s^2) + I(annWatDef_s^2) + I(tmin_anom_s^2) +
# I(precp_dry_anom_s^2) + I(prcpTempCorr_anom_s^2) + I(annWatDef_anom_s^2) +
# I(VPD_min_anom_s^2) + I(sand_s^2) + I(AWHC_s^2) + annWatDef_s:annWetDegDays_anom_s +
# prcpTempCorr_s:annWatDef_s + prcp_seasonality_anom_s:annWetDegDays_anom_s +
# prcpTempCorr_anom_s:annWetDegDays_anom_s + prcpTempCorr_s:annWetDegDays_anom_s +
# tmean_s:frostFreeDays_5_anom_s + prcp_s:isothermality_s +
# t_warm_anom_s:prcp_seasonality_anom_s + tmin_anom_s:t_warm_anom_s +
# sand_s:AWHC_s
,
family = stats::Gamma(link = "log")
)
coef(fit_i)
# lasso model predictions with the optimal lambda
optimal_pred <- predict(fit_i, newdata= testDat_all, type = "response"
)
# null model and predictions
# the "null" model in this case is the global model
# predict on the test data for this iteration w/ the global model
null_pred <- predict.glm(fit_glm_bestLambda, newdata = testDat_all, type = "response")
# save data
tmp <- data.frame(ecoRegion_holdout = rep(test_eco,length(y_test)),obs=y_test,
pred_opt=optimal_pred, pred_null=null_pred#,
#pred_nopenalty=nopen_pred
) %>%
cbind(testDat_all)
# put output into a list
tmpList <- list("testRegion" = i_eco,
"modelObject" = fit_i,
"modelPredictions" = tmp)
# save model outputs
outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
# save one example of model fits
# if(i_eco==sort(unique(modDat_1_s$NA_L2NAME))[1]){
# # save model object
# saveRDS(fit,file="./ModelResults/glmnet_fit.RDS")
#
# #saveRDS(nopen,file="./ModelResults/nopenalty_fit.RDS")
# }
}
#
# # calculate correlations between null and optimal model
# my_cors <- c(cor(outList[[i]]$modelPredictions$pred_opt, outList[[i]]$modelPredictions$obs), # correlation of LOO model's predictions w/ the observations
# cor(outList[[i]]$modelPredictions$s1, outList[[i]]$modelPredictions$obs)#, # correlations of LOO model's predictions w/ global model's predictions
# #cor(obs_pred$pred_nopenalty,obs_pred$obs)
# )
#
# # calculate mse between null and optimal model
# my_mse <- c(mean((outList[[i]]$modelPredictions$pred_opt - outList[[i]]$modelPredictions$obs)^2) ,
# mean((outList[[i]]$modelPredictions$s1 - outList[[i]]$modelPredictions$obs)^2)#,
# #mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
# )
# # print results
# names(my_cors) <- names(my_mse) <- c("optimal","null"#,"no penalty"
# )
# print(my_cors)
# print(my_mse)
#
# # save obs vs pred results
# write.csv(obs_pred,"./ModelResults/obs_vs_pred.csv",row.names=F)
for (i in 1:length(unique(modDat_1_s$NA_L2NAME))) {
holdoutRegion <- outList[[i]]$testRegion
predictionData <- outList[[i]]$modelPredictions
modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
as.data.frame() %>%
filter(V1!=0) %>%
rownames()
# calculate residuals
predictionData <- predictionData %>%
mutate(resid = .[["obs"]] - .[["pred_opt"]] ,
resid_globMod = .[["obs"]] - .[["pred_null"]])
# rasterize
# use 'test_rast' from earlier
# rasterize data
plotObs <- predictionData %>%
drop_na(paste(response)) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
tempExt <- crds(plotObs, na.rm = TRUE)
plotObs_2 <- plotObs %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# identify locations where residuals are >100 or < -100
badResids_high <- predictionData %>%
filter(resid > 100) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
badResids_low <- predictionData %>%
filter(resid < -100) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
# make figures
# make histogram
hist_i <- ggplot(predictionData) +
geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
xlab(c("Residuals (obs. - pred.)"))
# make map
map_i <- ggplot() +
geom_spatraster(data = plotObs_2) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
geom_sf(data = badResids_high, col = "blue") +
geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
scale_fill_gradient2(low = "red",
mid = "white" ,
high = "blue" ,
midpoint = 0, na.value = "lightgrey",
limits = c(-100, 100))
assign(paste0("residPlot_",holdoutRegion),
value = ggarrange(map_i, hist_i, heights = c(3,1), ncol = 1)
)
}
lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
get(paste0("residPlot_", x))
})
}
## [1] "ATLANTIC HIGHLANDS"
## [1] "CENTRAL USA PLAINS"
## [1] "EVERGLADES"
## [1] "MARINE WEST COAST FOREST"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WESTERN CORDILLERA"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
# # glm models
# #mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge
#
# #mods2save$formula <- best_form
# #mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
# #n <- nrow(df_sample)
# #mods2save$data_rows <- n
#
#
# if(!test_run) {
# saveRDS(mods2save,
# paste0("./models/glm_beta_model_CONUSwide_", s, "_n", n,
# #sample_group,
# ".RDS"))
# if (byRegion == TRUE) {
# ## western forests
# saveRDS(mods2save_WF,
# paste0("./models/glm_beta_model_WesternForests_", s, "_n", n,
# #sample_group,
# ".RDS"))
# ## eastern forests
# saveRDS(mods2save_EF,
# paste0("./models/glm_beta_model_EasternForests_", s, "_n", n,
# #sample_group,
# ".RDS"))
# ## grass/shrub
# saveRDS(mods2save_G,
# paste0("./models/glm_beta_model_GrassShrub_", s, "_n", n,
# #sample_group,
# ".RDS"))
# }
# }
## partial dependence plots
#vip::vip(mod_glmFinal, num_features = 15)
#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)
#caret::varImp(fit)
Hash of current commit (i.e. to ID the version of the code used)
system("git rev-parse HEAD", intern=TRUE)
## [1] "79890c55a196d40eb16ae968701c4515b44c260c"
Packages etc.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.6.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggpubr_0.6.0 doMC_1.3.8 iterators_1.0.14 foreach_1.5.2 factoextra_1.0.7 glmnet_4.1-8
## [7] Matrix_1.7-0 kableExtra_1.4.0 rsample_1.2.1 here_1.0.1 StepBeta_2.1.0 ggtext_0.1.2
## [13] knitr_1.49 gridExtra_2.3 pdp_0.8.2 GGally_2.2.1 lubridate_1.9.4 forcats_1.0.0
## [19] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.4 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [25] tidyverse_2.0.0 caret_6.0-94 lattice_0.22-6 ggplot2_3.5.1 sf_1.0-16 tidyterra_0.6.1
## [31] terra_1.8-21 ggspatial_1.1.9 dtplyr_1.3.1 patchwork_1.3.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_1.9.1 shape_1.4.6.1 magrittr_2.0.3
## [6] modeltools_0.2-23 farver_2.1.2 rmarkdown_2.29 vctrs_0.6.5 rstatix_0.7.2
## [11] htmltools_0.5.8.1 curl_6.2.1 broom_1.0.7 Formula_1.2-5 pROC_1.18.5
## [16] sass_0.4.9 parallelly_1.37.1 KernSmooth_2.23-22 bslib_0.9.0 plyr_1.8.9
## [21] sandwich_3.1-0 zoo_1.8-12 cachem_1.1.0 uuid_1.2-1 commonmark_1.9.1
## [26] lifecycle_1.0.4 pkgconfig_2.0.3 R6_2.6.1 fastmap_1.2.0 future_1.33.2
## [31] digest_0.6.37 colorspace_2.1-1 furrr_0.3.1 rprojroot_2.0.4 pkgload_1.3.4
## [36] labeling_0.4.3 timechange_0.3.0 mgcv_1.9-1 httr_1.4.7 abind_1.4-8
## [41] compiler_4.4.0 proxy_0.4-27 aod_1.3.3 withr_3.0.2 backports_1.5.0
## [46] carData_3.0-5 betareg_3.1-4 DBI_1.2.3 ggstats_0.9.0 ggsignif_0.6.4
## [51] MASS_7.3-60.2 lava_1.8.0 rappdirs_0.3.3 classInt_0.4-10 gtools_3.9.5
## [56] ModelMetrics_1.2.2.2 tools_4.4.0 units_0.8-5 lmtest_0.9-40 future.apply_1.11.2
## [61] nnet_7.3-19 glue_1.8.0 nlme_3.1-164 gridtext_0.1.5 grid_4.4.0
## [66] reshape2_1.4.4 generics_0.1.3 recipes_1.1.0 gtable_0.3.6 tzdb_0.4.0
## [71] class_7.3-22 data.table_1.17.0 hms_1.1.3 utf8_1.2.4 xml2_1.3.7
## [76] car_3.1-2 flexmix_2.3-19 markdown_1.13 ggrepel_0.9.5 pillar_1.10.1
## [81] splines_4.4.0 survival_3.5-8 tidyselect_1.2.1 svglite_2.1.3 stats4_4.4.0
## [86] xfun_0.51 hardhat_1.4.0 timeDate_4032.109 stringi_1.8.4 yaml_2.3.10
## [91] evaluate_1.0.3 codetools_0.2-20 cli_3.6.4 rpart_4.1.23 systemfonts_1.2.1
## [96] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.14 globals_0.16.3 gower_1.0.1
## [101] listenv_0.9.1 tigris_2.1 viridisLite_0.4.2 ipred_0.9-15 scales_1.3.0
## [106] prodlim_2024.06.25 e1071_1.7-14 crayon_1.5.3 combinat_0.0-8 rlang_1.1.5
## [111] cowplot_1.1.3